library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(MatchIt)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(corrplot)
## corrplot 0.84 loaded
library(knitr)
opts_chunk$set(echo = TRUE)
select all adopter assign to a new data frame “premium” select all non-adopter assign to a new data frame “free” deleting ID from adopter and non adopter since it’s not a key variable
getwd()
## [1] "/Users/yuziliu/Downloads"
Highnote <- read.csv("HighNote Data Midterm.csv", header = TRUE)
premium <- subset(Highnote, adopter == 1)
free <- subset(Highnote, adopter == 0)
premium$ID <- NULL
free$ID <- NULL
Generate descriptive statistics for the key variables in the data set summary the discriptive statistics of adopter and non-adopter
stargazer(premium, type="text", median=TRUE, digits=2, title="adopters summary")
##
## adopters summary
## =====================================================================================
## Statistic N Mean St. Dev. Min Pctl(25) Median Pctl(75) Max
## -------------------------------------------------------------------------------------
## age 3,527 25.98 6.84 8 21 24 29 73
## male 3,527 0.73 0.44 0 0 1 1 1
## friend_cnt 3,527 39.73 117.27 1 7 16 40 5,089
## avg_friend_age 3,527 25.44 5.21 12 22.1 24.4 27.6 62
## avg_friend_male 3,527 0.64 0.25 0.00 0.50 0.67 0.81 1.00
## friend_country_cnt 3,527 7.19 8.86 0 2 4 9 136
## subscriber_friend_cnt 3,527 1.64 5.85 0 0 0 2 287
## songsListened 3,527 33,758.04 43,592.73 0 7,804.5 20,908 43,989.5 817,290
## lovedTracks 3,527 264.34 491.43 0 30 108 292 10,220
## posts 3,527 21.20 221.99 0 0 0 2 8,506
## playlists 3,527 0.90 2.56 0 0 1 1 118
## shouts 3,527 99.44 1,156.07 0 2 9 41 65,872
## adopter 3,527 1.00 0.00 1 1 1 1 1
## tenure 3,527 45.58 20.04 0 32 46 60 111
## good_country 3,527 0.29 0.45 0 0 0 1 1
## -------------------------------------------------------------------------------------
stargazer(free, type="text", median=TRUE, digits=2, title="Non-adopters summary")
##
## Non-adopters summary
## ========================================================================================
## Statistic N Mean St. Dev. Min Pctl(25) Median Pctl(75) Max
## ----------------------------------------------------------------------------------------
## age 40,300 23.95 6.37 8 20 23 26 79
## male 40,300 0.62 0.48 0 0 1 1 1
## friend_cnt 40,300 18.49 57.48 1 3 7 18 4,957
## avg_friend_age 40,300 24.01 5.10 8.00 20.67 23.00 26.06 77.00
## avg_friend_male 40,300 0.62 0.32 0.00 0.43 0.67 0.90 1.00
## friend_country_cnt 40,300 3.96 5.76 0 1 2 4 129
## subscriber_friend_cnt 40,300 0.42 2.42 0 0 0 0 309
## songsListened 40,300 17,589.44 28,416.02 0 1,252 7,440 22,892.8 1,000,000
## lovedTracks 40,300 86.82 263.58 0 1 14 72 12,522
## posts 40,300 5.29 104.31 0 0 0 0 12,309
## playlists 40,300 0.55 1.07 0 0 0 1 98
## shouts 40,300 29.97 150.69 0 1 4 15 7,736
## adopter 40,300 0.00 0.00 0 0 0 0 0
## tenure 40,300 43.81 19.79 1 29 44 59 111
## good_country 40,300 0.36 0.48 0 0 0 1 1
## ----------------------------------------------------------------------------------------
do t-tests to compare difference in mean values of the variables for adopter and non-adopter
hn_cov <- c('age', 'male', 'friend_cnt', 'avg_friend_age', 'avg_friend_male', 'friend_country_cnt',
'subscriber_friend_cnt', 'songsListened', 'lovedTracks', 'posts', 'playlists',
'shouts', 'tenure', 'good_country')
Highnote %>%
group_by(adopter) %>%
select(one_of(hn_cov)) %>%
summarise_all(funs(mean(., na.rm = T)))
## Adding missing grouping variables: `adopter`
lapply(hn_cov, function(v) {
t.test(Highnote[, v] ~ Highnote$adopter)
})
## [[1]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote$adopter
## t = -16.996, df = 4079.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.265768 -1.797097
## sample estimates:
## mean in group 0 mean in group 1
## 23.94844 25.97987
##
##
## [[2]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote$adopter
## t = -13.654, df = 4295, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.12278707 -0.09195413
## sample estimates:
## mean in group 0 mean in group 1
## 0.6218610 0.7292316
##
##
## [[3]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote$adopter
## t = -10.646, df = 3675.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -25.15422 -17.32999
## sample estimates:
## mean in group 0 mean in group 1
## 18.49166 39.73377
##
##
## [[4]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote$adopter
## t = -15.658, df = 4140.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.608931 -1.250852
## sample estimates:
## mean in group 0 mean in group 1
## 24.01142 25.44131
##
##
## [[5]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote$adopter
## t = -4.4426, df = 4591.6, p-value = 9.097e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02883955 -0.01117951
## sample estimates:
## mean in group 0 mean in group 1
## 0.6165888 0.6365983
##
##
## [[6]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote$adopter
## t = -21.267, df = 3791.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.528795 -2.933081
## sample estimates:
## mean in group 0 mean in group 1
## 3.957891 7.188829
##
##
## [[7]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote$adopter
## t = -12.287, df = 3632.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.413899 -1.024766
## sample estimates:
## mean in group 0 mean in group 1
## 0.417469 1.636802
##
##
## [[8]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote$adopter
## t = -21.629, df = 3792.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -17634.24 -14702.96
## sample estimates:
## mean in group 0 mean in group 1
## 17589.44 33758.04
##
##
## [[9]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote$adopter
## t = -21.188, df = 3705.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -193.9447 -161.0917
## sample estimates:
## mean in group 0 mean in group 1
## 86.82263 264.34080
##
##
## [[10]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote$adopter
## t = -4.2151, df = 3663.5, p-value = 2.557e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -23.30665 -8.50825
## sample estimates:
## mean in group 0 mean in group 1
## 5.293002 21.200454
##
##
## [[11]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote$adopter
## t = -8.0816, df = 3634.7, p-value = 8.619e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.4367565 -0.2662138
## sample estimates:
## mean in group 0 mean in group 1
## 0.5492804 0.9007655
##
##
## [[12]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote$adopter
## t = -3.5659, df = 3536.5, p-value = 0.0003674
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -107.66170 -31.27249
## sample estimates:
## mean in group 0 mean in group 1
## 29.97266 99.43975
##
##
## [[13]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote$adopter
## t = -5.0434, df = 4150.6, p-value = 4.768e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.462620 -1.083959
## sample estimates:
## mean in group 0 mean in group 1
## 43.80993 45.58322
##
##
## [[14]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote$adopter
## t = 8.8009, df = 4248.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.05463587 0.08595434
## sample estimates:
## mean in group 0 mean in group 1
## 0.3577916 0.2874965
We can see that the mean difference of all covariates are significant. From these comparisons, we can make a tenative conclusion that: * Users who are older, male, and their friends are older, tend to become fee-users * Users who have more friends, and more friends from different countries, tend to become fee-users. * Users who have more premium friends, tend to become fee-users. * Users who are more engaging (listened more songs, loved more tracks, made more posts and playlists, received more shouts, been on the site longer) are more likely to become fee-users.
plot1 <- ggplot(data = Highnote,
mapping = aes(x = age)) +
geom_density() +
facet_wrap(~ adopter)
plot2 <- ggplot(data = Highnote,
mapping = aes(x = male)) +
geom_histogram() +
facet_wrap(~ adopter)
grid.arrange(plot1, plot2, ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = Highnote,
mapping = aes(x = good_country)) +
geom_histogram() +
facet_wrap(~ adopter)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot3 <- ggplot(data = Highnote,
mapping = aes(x = friend_cnt)) +
geom_density() +
facet_wrap(~ adopter)
plot4 <- ggplot(data = Highnote,
mapping = aes(x = avg_friend_age)) +
geom_density() +
facet_wrap(~ adopter)
grid.arrange(plot3, plot4, ncol=2)
plot5 <- ggplot(data = Highnote,
mapping = aes(x = avg_friend_male)) +
geom_histogram() +
facet_wrap(~ adopter)
plot6 <- ggplot(data = Highnote,
mapping = aes(x = friend_country_cnt)) +
geom_density() +
facet_wrap(~ adopter)
grid.arrange(plot5, plot6, ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = Highnote,
mapping = aes(x = subscriber_friend_cnt)) +
geom_density() +
facet_wrap(~ adopter)
plot7 <- ggplot(data = Highnote,
mapping = aes(x = songsListened)) +
geom_density() +
facet_wrap(~ adopter)
plot8 <- ggplot(data = Highnote,
mapping = aes(x = lovedTracks)) +
geom_density() +
facet_wrap(~ adopter)
grid.arrange(plot7, plot8, ncol=2)
plot9 <- ggplot(data = Highnote,
mapping = aes(x = posts)) +
geom_density() +
facet_wrap(~ adopter)
plot10 <- ggplot(data = Highnote,
mapping = aes(x = playlists)) +
geom_density() +
facet_wrap(~ adopter)
grid.arrange(plot9, plot10, ncol=2)
ggplot(data = Highnote,
mapping = aes(x = tenure)) +
geom_density() +
facet_wrap(~ adopter)
From the visualization, we can make same conclusion as the mean difference analysis that: * Users who are older, male, and their friends are older, tend to become fee-users Users who have more friends, and more friends from different countries, tend to become fee-users. * Users who have more premium friends, tend to become fee-users. * Users who are more engaging (listened more songs, loved more tracks, made more posts and playlists, received more shouts, been on the site longer) are more likely to become fee-users.
create treatment and control groups * “treatment” group: users that have one or more subscriber friends (subscriber_friend_cnt >= 1) * “control” group: users with zero subscriber friends (subscriber_friend_cnt = 0)
Highnote$ynsf = ifelse(Highnote$subscriber_friend_cnt >= 1, 1, 0)
*1.1: Difference-in-means: outcome variable Using adopter as the outcome variable of interest. (1 = adopter; 0 = non-adopter), the independent variable of interest is ynsf. (1 = having subcriber friends; 0 = not having)
with(Highnote, t.test(adopter ~ ynsf))
##
## Welch Two Sample t-test
##
## data: adopter by ynsf
## t = -30.961, df = 11815, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1330281 -0.1171869
## sample estimates:
## mean in group 0 mean in group 1
## 0.05243501 0.17754250
We see that the difference-in-means is statistically significant at conventional levels of confidence.
hn_cov2 <- c('age', 'male', 'friend_cnt', 'avg_friend_age', 'avg_friend_male', 'friend_country_cnt',
'songsListened', 'lovedTracks', 'posts', 'playlists',
'shouts', 'tenure', 'good_country')
Highnote %>%
group_by(ynsf) %>%
select(one_of(hn_cov2)) %>%
summarise_all(funs(mean(., na.rm = T)))
## Adding missing grouping variables: `ynsf`
Then we can carry out t-tests to evaluate whether these means are statistically distinguishable:
lapply(hn_cov2, function(v) {
t.test(Highnote[, v] ~ Highnote[, 'ynsf'])
})
## [[1]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote[, "ynsf"]
## t = -20.841, df = 14645, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.778544 -1.472749
## sample estimates:
## mean in group 0 mean in group 1
## 23.74756 25.37321
##
##
## [[2]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote[, "ynsf"]
## t = -1.3459, df = 15986, p-value = 0.1784
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.018236129 0.003388028
## sample estimates:
## mean in group 0 mean in group 1
## 0.6288378 0.6362618
##
##
## [[3]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote[, "ynsf"]
## t = -33.707, df = 9903.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -46.12459 -41.05469
## sample estimates:
## mean in group 0 mean in group 1
## 10.43133 54.02097
##
##
## [[4]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote[, "ynsf"]
## t = -27.658, df = 15667, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.744514 -1.513611
## sample estimates:
## mean in group 0 mean in group 1
## 23.76137 25.39043
##
##
## [[5]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote[, "ynsf"]
## t = -7.7114, df = 23020, p-value = 1.294e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02846397 -0.01692672
## sample estimates:
## mean in group 0 mean in group 1
## 0.6131124 0.6358077
##
##
## [[6]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote[, "ynsf"]
## t = -65.05, df = 10372, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.861271 -6.459857
## sample estimates:
## mean in group 0 mean in group 1
## 2.725062 9.385626
##
##
## [[7]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote[, "ynsf"]
## t = -41.505, df = 11447, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -20037.04 -18229.80
## sample estimates:
## mean in group 0 mean in group 1
## 14602.22 33735.64
##
##
## [[8]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote[, "ynsf"]
## t = -31.265, df = 10585, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -170.1918 -150.1102
## sample estimates:
## mean in group 0 mean in group 1
## 65.21365 225.36465
##
##
## [[9]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote[, "ynsf"]
## t = -7.3649, df = 9933.6, p-value = 1.914e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -22.76492 -13.19424
## sample estimates:
## mean in group 0 mean in group 1
## 2.543377 20.522956
##
##
## [[10]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote[, "ynsf"]
## t = -10.492, df = 11238, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2546958 -0.1745100
## sample estimates:
## mean in group 0 mean in group 1
## 0.5294671 0.7440700
##
##
## [[11]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote[, "ynsf"]
## t = -11.426, df = 9888.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -100.04703 -70.74591
## sample estimates:
## mean in group 0 mean in group 1
## 16.42304 101.81951
##
##
## [[12]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote[, "ynsf"]
## t = -14.696, df = 15805, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.792309 -2.899752
## sample estimates:
## mean in group 0 mean in group 1
## 43.20268 46.54871
##
##
## [[13]]
##
## Welch Two Sample t-test
##
## data: Highnote[, v] by Highnote[, "ynsf"]
## t = 2.0956, df = 16030, p-value = 0.03613
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.0007383591 0.0220968020
## sample estimates:
## mean in group 0 mean in group 1
## 0.3546936 0.3432760
We see that except for ‘male’, all mean value of other variables are statistically distinguishable. We should then exclude ‘male’ in the PSM logit model.
We estimate the propensity score by running a logit model, where the outcome variable is a binary variable indicating treatment status.
Highnote <- Highnote %>% mutate(songsListened_1k = songsListened / 1000)
h_ps <- glm(ynsf ~ age + friend_cnt + avg_friend_age + avg_friend_male + friend_country_cnt
+ songsListened_1k + lovedTracks + posts + playlists
+ shouts + tenure + good_country, family = binomial(), data = Highnote)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(h_ps)
##
## Call:
## glm(formula = ynsf ~ age + friend_cnt + avg_friend_age + avg_friend_male +
## friend_country_cnt + songsListened_1k + lovedTracks + posts +
## playlists + shouts + tenure + good_country, family = binomial(),
## data = Highnote)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.4154 -0.5668 -0.4221 -0.3009 2.5520
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.124e+00 7.566e-02 -67.720 < 2e-16 ***
## age 2.043e-02 2.757e-03 7.409 1.27e-13 ***
## friend_cnt 3.131e-02 1.033e-03 30.295 < 2e-16 ***
## avg_friend_age 7.904e-02 3.460e-03 22.843 < 2e-16 ***
## avg_friend_male 2.528e-01 5.027e-02 5.030 4.92e-07 ***
## friend_country_cnt 1.105e-01 4.751e-03 23.266 < 2e-16 ***
## songsListened_1k 7.012e-03 5.107e-04 13.731 < 2e-16 ***
## lovedTracks 6.685e-04 5.644e-05 11.845 < 2e-16 ***
## posts 5.753e-04 2.686e-04 2.142 0.0322 *
## playlists 5.249e-03 1.191e-02 0.441 0.6593
## shouts -5.027e-05 3.678e-05 -1.367 0.1717
## tenure -2.534e-03 7.766e-04 -3.262 0.0011 **
## good_country 3.088e-02 2.921e-02 1.057 0.2903
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 46640 on 43826 degrees of freedom
## Residual deviance: 34173 on 43814 degrees of freedom
## AIC: 34199
##
## Number of Fisher Scoring iterations: 8
After that, we calculate the propensity score for each user. That is, the user’s predicted probability of being Treated, given the estimates from the logit model.
prs_df <- data.frame(pr_score = predict(h_ps, type = "response"),
ynsf = h_ps$model$ynsf)
head(prs_df)
*2.1 Examining the region of common support We can plot histograms of the estimated propensity scores by treatment status:
labs <- paste("Actual fee friends:", c("Have", "Don't have"))
prs_df %>%
mutate(ynsf = ifelse(ynsf == 1, labs[1], labs[2])) %>%
ggplot(aes(x = pr_score)) +
geom_histogram(color = "white") +
facet_wrap(~ynsf) +
xlab("Probability of having fee friends") +
theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We find pairs of observations that have very similar propensity scores, but that differ in their treatment status.
Highnote_nomiss <- Highnote %>% # MatchIt does not allow missing values
select(adopter, ynsf, one_of(hn_cov2)) %>%
na.omit()
mod_match <- matchit(ynsf ~ age + friend_cnt + avg_friend_age + avg_friend_male + friend_country_cnt
+ songsListened + lovedTracks + posts + playlists
+ shouts + tenure + good_country,
method = "nearest", data = Highnote_nomiss)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
get some information about how successful the matching was:
summary(mod_match)
##
## Call:
## matchit(formula = ynsf ~ age + friend_cnt + avg_friend_age +
## avg_friend_male + friend_country_cnt + songsListened + lovedTracks +
## posts + playlists + shouts + tenure + good_country, data = Highnote_nomiss,
## method = "nearest")
##
## Summary of balance for all data:
## Means Treated Means Control SD Control Mean Diff
## distance 0.4635 0.1550 0.1436 0.3085
## age 25.3732 23.7476 6.2245 1.6256
## friend_cnt 54.0210 10.4313 15.2769 43.5896
## avg_friend_age 25.3904 23.7614 5.0577 1.6291
## avg_friend_male 0.6358 0.6131 0.3343 0.0227
## friend_country_cnt 9.3856 2.7251 3.1024 6.6606
## songsListened 33735.6404 14602.2205 23214.2898 19133.4199
## lovedTracks 225.3647 65.2137 181.4812 160.1510
## posts 20.5230 2.5434 33.7947 17.9796
## playlists 0.7441 0.5295 0.9673 0.2146
## shouts 101.8195 16.4230 79.7381 85.3965
## tenure 46.5487 43.2027 19.7212 3.3460
## good_country 0.3433 0.3547 0.4784 -0.0114
## eQQ Med eQQ Mean eQQ Max
## distance 0.2510 0.3085 0.6843
## age 1.0000 1.6296 5.0000
## friend_cnt 22.0000 43.5838 4794.0000
## avg_friend_age 1.5909 1.6369 11.5000
## avg_friend_male 0.0738 0.0958 0.3636
## friend_country_cnt 5.0000 6.6598 95.0000
## songsListened 15471.0000 19126.1623 653702.0000
## lovedTracks 65.0000 159.9562 6343.0000
## posts 0.0000 17.8829 9535.0000
## playlists 0.0000 0.2092 26.0000
## shouts 15.0000 85.1764 59168.0000
## tenure 3.0000 3.3473 10.0000
## good_country 0.0000 0.0114 1.0000
##
##
## Summary of balance for matched data:
## Means Treated Means Control SD Control Mean Diff
## distance 0.4635 0.3040 0.1914 0.1595
## age 25.3732 26.4180 8.0005 -1.0448
## friend_cnt 54.0210 21.4052 23.5586 32.6158
## avg_friend_age 25.3904 26.5864 6.7140 -1.1960
## avg_friend_male 0.6358 0.6557 0.2647 -0.0199
## friend_country_cnt 9.3856 5.0768 4.6543 4.3089
## songsListened 33735.6404 27219.9089 33842.4008 6515.7315
## lovedTracks 225.3647 134.9342 299.0241 90.4304
## posts 20.5230 6.2695 60.7689 14.2535
## playlists 0.7441 0.6720 1.3948 0.0721
## shouts 101.8195 37.2356 138.7147 64.5839
## tenure 46.5487 47.6901 19.0755 -1.1414
## good_country 0.3433 0.3615 0.4805 -0.0182
## eQQ Med eQQ Mean eQQ Max
## distance 0.1087 0.1595 0.4520
## age 1.0000 1.0448 8.0000
## friend_cnt 12.0000 32.6158 4794.0000
## avg_friend_age 0.5000 1.2839 14.0000
## avg_friend_male 0.0147 0.0329 0.1642
## friend_country_cnt 2.0000 4.3089 95.0000
## songsListened 4904.0000 6515.7315 566867.0000
## lovedTracks 38.0000 90.4304 6180.0000
## posts 0.0000 14.2535 9535.0000
## playlists 0.0000 0.1106 22.0000
## shouts 10.0000 64.5839 59168.0000
## tenure 1.0000 1.2792 4.0000
## good_country 0.0000 0.0182 1.0000
##
## Percent Balance Improvement:
## Mean Diff. eQQ Med eQQ Mean eQQ Max
## distance 48.2996 56.7139 48.2975 33.9443
## age 35.7306 0.0000 35.8883 -60.0000
## friend_cnt 25.1753 45.4545 25.1654 0.0000
## avg_friend_age 26.5857 68.5714 21.5672 -21.7391
## avg_friend_male 12.2518 80.0840 65.7176 54.8507
## friend_country_cnt 35.3078 60.0000 35.3001 0.0000
## songsListened 65.9458 68.3020 65.9329 13.2836
## lovedTracks 43.5343 41.5385 43.4655 2.5698
## posts 20.7240 0.0000 20.2956 0.0000
## playlists 66.4144 0.0000 47.1533 15.3846
## shouts 24.3717 33.3333 24.1763 0.0000
## tenure 65.8879 66.6667 61.7834 60.0000
## good_country -59.6007 0.0000 -59.8214 0.0000
##
## Sample sizes:
## Control Treated
## All 34004 9823
## Matched 9823 9823
## Unmatched 24181 0
## Discarded 0 0
plot(mod_match)
create a dataframe containing only the matched observations:
data_m <- match.data(mod_match)
dim(data_m)
## [1] 19646 17
The final dataset is smaller than the original: it contains 19646 observations, meaning that 9823 pairs of treated and control observations were matched. The final dataset contains a variable called distance, which is the propensity score.
fin_bal <- function(data, variable) {
data$variable <- data[, variable]
if (variable == 'songsListened') data$variable <- data$variable / 10^3
data$ynsf <- as.factor(data$ynsf)
support <- c(min(data$variable), max(data$variable))
ggplot(data, aes(x = distance, y = variable, color = ynsf)) +
geom_point(alpha = 0.2, size = 1.3) +
geom_smooth(method = "loess", se = F) +
xlab("Propensity score") +
ylab(variable) +
theme_bw() +
ylim(support)
}
grid.arrange(
fin_bal(data_m, "age"),
fin_bal(data_m, "friend_cnt") + theme(legend.position = "none"),
fin_bal(data_m, "avg_friend_age"),
fin_bal(data_m, "avg_friend_male") + theme(legend.position = "none"),
fin_bal(data_m, "friend_country_cnt"),
fin_bal(data_m, "songsListened") + theme(legend.position = "none"),
fin_bal(data_m, "lovedTracks"),
nrow =4, widths = c(1, 0.8)
)
grid.arrange(
fin_bal(data_m, "lovedTracks"),
fin_bal(data_m, "posts") + theme(legend.position = "none"),
fin_bal(data_m, "playlists"),
fin_bal(data_m, "shouts") + theme(legend.position = "none"),
fin_bal(data_m, "tenure"),
fin_bal(data_m, "good_country"),
nrow = 3, widths = c(1, 0.8)
)
## Warning: Removed 4 rows containing missing values (geom_smooth).
data_m %>%
group_by(ynsf) %>%
select(one_of(hn_cov2)) %>%
summarise_all(funs(mean))
## Adding missing grouping variables: `ynsf`
lapply(hn_cov2, function(v) {
t.test(data_m[, v] ~ data_m$ynsf)
})
## [[1]]
##
## Welch Two Sample t-test
##
## data: data_m[, v] by data_m$ynsf
## t = 9.7592, df = 19282, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.8349505 1.2546352
## sample estimates:
## mean in group 0 mean in group 1
## 26.41800 25.37321
##
##
## [[2]]
##
## Welch Two Sample t-test
##
## data: data_m[, v] by data_m$ynsf
## t = 1.7116, df = 19643, p-value = 0.08699
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.001699904 0.025114339
## sample estimates:
## mean in group 0 mean in group 1
## 0.6479691 0.6362618
##
##
## [[3]]
##
## Welch Two Sample t-test
##
## data: data_m[, v] by data_m$ynsf
## t = -24.855, df = 10488, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -35.18808 -30.04352
## sample estimates:
## mean in group 0 mean in group 1
## 21.40517 54.02097
##
##
## [[4]]
##
## Welch Two Sample t-test
##
## data: data_m[, v] by data_m$ynsf
## t = 13.992, df = 18434, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.028425 1.363504
## sample estimates:
## mean in group 0 mean in group 1
## 26.58639 25.39043
##
##
## [[5]]
##
## Welch Two Sample t-test
##
## data: data_m[, v] by data_m$ynsf
## t = 5.6307, df = 19263, p-value = 1.82e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.01298231 0.02684722
## sample estimates:
## mean in group 0 mean in group 1
## 0.6557225 0.6358077
##
##
## [[6]]
##
## Welch Two Sample t-test
##
## data: data_m[, v] by data_m$ynsf
## t = -38.685, df = 13879, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.527193 -4.090541
## sample estimates:
## mean in group 0 mean in group 1
## 5.076759 9.385626
##
##
## [[7]]
##
## Welch Two Sample t-test
##
## data: data_m[, v] by data_m$ynsf
## t = -11.642, df = 18439, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -7612.782 -5418.681
## sample estimates:
## mean in group 0 mean in group 1
## 27219.91 33735.64
##
##
## [[8]]
##
## Welch Two Sample t-test
##
## data: data_m[, v] by data_m$ynsf
## t = -15.424, df = 16085, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -101.9222 -78.9386
## sample estimates:
## mean in group 0 mean in group 1
## 134.9342 225.3647
##
##
## [[9]]
##
## Welch Two Sample t-test
##
## data: data_m[, v] by data_m$ynsf
## t = -5.6778, df = 11063, p-value = 1.399e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -19.17429 -9.33268
## sample estimates:
## mean in group 0 mean in group 1
## 6.26947 20.52296
##
##
## [[10]]
##
## Welch Two Sample t-test
##
## data: data_m[, v] by data_m$ynsf
## t = -2.9701, df = 17743, p-value = 0.002981
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.11964186 -0.02450962
## sample estimates:
## mean in group 0 mean in group 1
## 0.6719943 0.7440700
##
##
## [[11]]
##
## Welch Two Sample t-test
##
## data: data_m[, v] by data_m$ynsf
## t = -8.5073, df = 10512, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -79.46493 -49.70295
## sample estimates:
## mean in group 0 mean in group 1
## 37.23557 101.81951
##
##
## [[12]]
##
## Welch Two Sample t-test
##
## data: data_m[, v] by data_m$ynsf
## t = 4.1015, df = 19607, p-value = 4.121e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.5959372 1.6868684
## sample estimates:
## mean in group 0 mean in group 1
## 47.69012 46.54871
##
##
## [[13]]
##
## Welch Two Sample t-test
##
## data: data_m[, v] by data_m$ynsf
## t = 2.6737, df = 19641, p-value = 0.007509
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.004863394 0.031581684
## sample estimates:
## mean in group 0 mean in group 1
## 0.3614985 0.3432760
Estimating treatment effects: Estimating the treatment effect is simple once we have a matched sample that we are happy with. We can use a t-test:
with(data_m, t.test(adopter ~ ynsf))
##
## Welch Two Sample t-test
##
## data: adopter by ynsf
## t = -18.938, df = 18060, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.10009352 -0.08131745
## sample estimates:
## mean in group 0 mean in group 1
## 0.08683702 0.17754250
Here for matched data: adopter by ynsf, t = -18.938, comparing to before matching t = -30.961.
We can also do binomial regression:
glm_treat1 <- glm(adopter ~ ynsf, family = binomial(), data = data_m)
summary(glm_treat1)
##
## Call:
## glm(formula = adopter ~ ynsf, family = binomial(), data = data_m)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6252 -0.6252 -0.4262 -0.4262 2.2108
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.35288 0.03583 -65.67 <2e-16 ***
## ynsf 0.81979 0.04451 18.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15345 on 19645 degrees of freedom
## Residual deviance: 14986 on 19644 degrees of freedom
## AIC: 14990
##
## Number of Fisher Scoring iterations: 5
glm_treat2 <- glm(adopter ~ ynsf + age + friend_cnt + avg_friend_age + avg_friend_male + friend_country_cnt
+ lovedTracks + posts + playlists
+ shouts + tenure + good_country
+ I(songsListened / 10^3), family = binomial(), data = data_m)
summary(glm_treat2)
##
## Call:
## glm(formula = adopter ~ ynsf + age + friend_cnt + avg_friend_age +
## avg_friend_male + friend_country_cnt + lovedTracks + posts +
## playlists + shouts + tenure + good_country + I(songsListened/10^3),
## family = binomial(), data = data_m)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2269 -0.5694 -0.4544 -0.3793 2.4961
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.188e+00 1.225e-01 -26.030 < 2e-16 ***
## ynsf 7.244e-01 4.675e-02 15.494 < 2e-16 ***
## age 1.897e-02 3.912e-03 4.850 1.23e-06 ***
## friend_cnt -1.054e-04 2.742e-04 -0.384 0.700784
## avg_friend_age 8.544e-03 5.308e-03 1.610 0.107470
## avg_friend_male 6.805e-02 9.266e-02 0.734 0.462696
## friend_country_cnt 5.273e-03 3.610e-03 1.461 0.144146
## lovedTracks 5.264e-04 4.687e-05 11.232 < 2e-16 ***
## posts 1.216e-04 8.881e-05 1.369 0.170870
## playlists 4.567e-02 1.216e-02 3.757 0.000172 ***
## shouts 9.836e-05 7.270e-05 1.353 0.176067
## tenure -1.997e-03 1.217e-03 -1.641 0.100842
## good_country -3.942e-01 4.809e-02 -8.198 2.44e-16 ***
## I(songsListened/10^3) 4.560e-03 5.293e-04 8.617 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15345 on 19645 degrees of freedom
## Residual deviance: 14530 on 19632 degrees of freedom
## AIC: 14558
##
## Number of Fisher Scoring iterations: 5
After we eliminate the background variable differences for treatment and control group,(control for the differences). Having subscriber friends has higher probability of being adopter than don’t have subscriber friends
Now, we will use a logistic regression approach to test which variables (including subscriber friends) are significant for explaining the likelihood of becoming an adopter.
Before we fitting into the logistic regression model, let’s see the correlation between the predictors.
res2 <- cor(Highnote)
res2
## ID age male friend_cnt
## ID 1.000000000 0.037640058 0.016121071 0.042525111
## age 0.037640058 1.000000000 0.169075297 -0.033964722
## male 0.016121071 0.169075297 1.000000000 -0.004292194
## friend_cnt 0.042525111 -0.033964722 -0.004292194 1.000000000
## avg_friend_age 0.035581652 0.688102645 0.049227806 -0.051956870
## avg_friend_male 0.006456926 0.075157294 0.051880384 -0.009592896
## friend_country_cnt 0.063656866 -0.031242578 -0.042381576 0.718526722
## subscriber_friend_cnt 0.054907363 0.077454545 0.007156261 0.781243469
## songsListened 0.069063354 0.021965699 0.116487201 0.213134540
## lovedTracks 0.080093025 0.054059810 0.022807694 0.195800406
## posts 0.010076761 0.005055075 0.008733265 0.046903177
## playlists 0.034083872 0.112464340 -0.008456892 0.047125916
## shouts 0.023707630 -0.023501423 -0.017216850 0.195353714
## adopter 0.471166209 0.085879158 0.060513120 0.089397803
## tenure 0.006125125 0.300314069 0.093779479 -0.001366727
## good_country -0.003869528 0.097712013 0.001332812 -0.031759499
## ynsf 0.084054858 0.105352120 0.006414118 0.281218822
## songsListened_1k 0.069063354 0.021965699 0.116487201 0.213134540
## avg_friend_age avg_friend_male friend_country_cnt
## ID 0.035581652 0.0064569255 0.063656866
## age 0.688102645 0.0751572936 -0.031242578
## male 0.049227806 0.0518803840 -0.042381576
## friend_cnt -0.051956870 -0.0095928960 0.718526722
## avg_friend_age 1.000000000 0.1817757237 -0.037194340
## avg_friend_male 0.181775724 1.0000000000 -0.022870690
## friend_country_cnt -0.037194340 -0.0228706898 1.000000000
## subscriber_friend_cnt 0.062994976 0.0098483093 0.508548615
## songsListened 0.001649563 0.0195449919 0.328554333
## lovedTracks 0.043798816 -0.0006237683 0.308362521
## posts 0.006227368 0.0052870775 0.085229354
## playlists 0.104146564 -0.0025497456 0.095861697
## shouts -0.022668047 -0.0057908193 0.214654915
## adopter 0.075862799 0.0173318849 0.143259895
## tenure 0.317447021 0.0857552704 0.008143325
## good_country 0.094390733 0.0216240488 -0.051725026
## ynsf 0.132493714 0.0301354328 0.452729496
## songsListened_1k 0.001649563 0.0195449919 0.328554333
## subscriber_friend_cnt songsListened lovedTracks
## ID 0.054907363 0.069063354 0.0800930255
## age 0.077454545 0.021965699 0.0540598100
## male 0.007156261 0.116487201 0.0228076943
## friend_cnt 0.781243469 0.213134540 0.1958004056
## avg_friend_age 0.062994976 0.001649563 0.0437988161
## avg_friend_male 0.009848309 0.019544992 -0.0006237683
## friend_country_cnt 0.508548615 0.328554333 0.3083625207
## subscriber_friend_cnt 1.000000000 0.137199916 0.1762751646
## songsListened 0.137199916 1.000000000 0.2331350138
## lovedTracks 0.176275165 0.233135014 1.0000000000
## posts 0.054450057 0.089020150 0.0582214108
## playlists 0.082733792 0.074007584 0.1347662040
## shouts 0.137848108 0.130239814 0.0999825871
## adopter 0.115550333 0.145427363 0.1650097408
## tenure 0.018916949 0.241593913 0.0108400035
## good_country 0.008683892 0.027378568 0.0131821863
## ynsf 0.334185344 0.263814198 0.2282066299
## songsListened_1k 0.137199916 1.000000000 0.2331350138
## posts playlists shouts adopter
## ID 0.010076761 0.0340838716 0.023707630 0.47116621
## age 0.005055075 0.1124643397 -0.023501423 0.08587916
## male 0.008733265 -0.0084568920 -0.017216850 0.06051312
## friend_cnt 0.046903177 0.0471259162 0.195353714 0.08939780
## avg_friend_age 0.006227368 0.1041465637 -0.022668047 0.07586280
## avg_friend_male 0.005287077 -0.0025497456 -0.005790819 0.01733188
## friend_country_cnt 0.085229354 0.0958616970 0.214654915 0.14325989
## subscriber_friend_cnt 0.054450057 0.0827337920 0.137848108 0.11555033
## songsListened 0.089020150 0.0740075837 0.130239814 0.14542736
## lovedTracks 0.058221411 0.1347662040 0.099982587 0.16500974
## posts 1.000000000 0.0120232306 0.122411385 0.03658764
## playlists 0.012023231 1.0000000000 0.015808382 0.07572234
## shouts 0.122411385 0.0158083825 1.000000000 0.05266217
## adopter 0.036587643 0.0757223372 0.052662169 1.00000000
## tenure 0.039933777 0.0716343706 0.021943838 0.02434451
## good_country -0.001882863 -0.0008543083 -0.017101503 -0.04003519
## ynsf 0.063393565 0.0708736019 0.099241045 0.19178523
## songsListened_1k 0.089020150 0.0740075837 0.130239814 0.14542736
## tenure good_country ynsf
## ID 0.006125125 -0.0038695281 0.084054858
## age 0.300314069 0.0977120133 0.105352120
## male 0.093779479 0.0013328121 0.006414118
## friend_cnt -0.001366727 -0.0317594993 0.281218822
## avg_friend_age 0.317447021 0.0943907326 0.132493714
## avg_friend_male 0.085755270 0.0216240488 0.030135433
## friend_country_cnt 0.008143325 -0.0517250258 0.452729496
## subscriber_friend_cnt 0.018916949 0.0086838916 0.334185344
## songsListened 0.241593913 0.0273785683 0.263814198
## lovedTracks 0.010840004 0.0131821863 0.228206630
## posts 0.039933777 -0.0018828628 0.063393565
## playlists 0.071634371 -0.0008543083 0.070873602
## shouts 0.021943838 -0.0171015027 0.099241045
## adopter 0.024344506 -0.0400351891 0.191785227
## tenure 1.000000000 0.1320492934 0.070417882
## good_country 0.132049293 1.0000000000 -0.009968337
## ynsf 0.070417882 -0.0099683365 1.000000000
## songsListened_1k 0.241593913 0.0273785683 0.263814198
## songsListened_1k
## ID 0.069063354
## age 0.021965699
## male 0.116487201
## friend_cnt 0.213134540
## avg_friend_age 0.001649563
## avg_friend_male 0.019544992
## friend_country_cnt 0.328554333
## subscriber_friend_cnt 0.137199916
## songsListened 1.000000000
## lovedTracks 0.233135014
## posts 0.089020150
## playlists 0.074007584
## shouts 0.130239814
## adopter 0.145427363
## tenure 0.241593913
## good_country 0.027378568
## ynsf 0.263814198
## songsListened_1k 1.000000000
round(res2, 4)
## ID age male friend_cnt avg_friend_age
## ID 1.0000 0.0376 0.0161 0.0425 0.0356
## age 0.0376 1.0000 0.1691 -0.0340 0.6881
## male 0.0161 0.1691 1.0000 -0.0043 0.0492
## friend_cnt 0.0425 -0.0340 -0.0043 1.0000 -0.0520
## avg_friend_age 0.0356 0.6881 0.0492 -0.0520 1.0000
## avg_friend_male 0.0065 0.0752 0.0519 -0.0096 0.1818
## friend_country_cnt 0.0637 -0.0312 -0.0424 0.7185 -0.0372
## subscriber_friend_cnt 0.0549 0.0775 0.0072 0.7812 0.0630
## songsListened 0.0691 0.0220 0.1165 0.2131 0.0016
## lovedTracks 0.0801 0.0541 0.0228 0.1958 0.0438
## posts 0.0101 0.0051 0.0087 0.0469 0.0062
## playlists 0.0341 0.1125 -0.0085 0.0471 0.1041
## shouts 0.0237 -0.0235 -0.0172 0.1954 -0.0227
## adopter 0.4712 0.0859 0.0605 0.0894 0.0759
## tenure 0.0061 0.3003 0.0938 -0.0014 0.3174
## good_country -0.0039 0.0977 0.0013 -0.0318 0.0944
## ynsf 0.0841 0.1054 0.0064 0.2812 0.1325
## songsListened_1k 0.0691 0.0220 0.1165 0.2131 0.0016
## avg_friend_male friend_country_cnt
## ID 0.0065 0.0637
## age 0.0752 -0.0312
## male 0.0519 -0.0424
## friend_cnt -0.0096 0.7185
## avg_friend_age 0.1818 -0.0372
## avg_friend_male 1.0000 -0.0229
## friend_country_cnt -0.0229 1.0000
## subscriber_friend_cnt 0.0098 0.5085
## songsListened 0.0195 0.3286
## lovedTracks -0.0006 0.3084
## posts 0.0053 0.0852
## playlists -0.0025 0.0959
## shouts -0.0058 0.2147
## adopter 0.0173 0.1433
## tenure 0.0858 0.0081
## good_country 0.0216 -0.0517
## ynsf 0.0301 0.4527
## songsListened_1k 0.0195 0.3286
## subscriber_friend_cnt songsListened lovedTracks
## ID 0.0549 0.0691 0.0801
## age 0.0775 0.0220 0.0541
## male 0.0072 0.1165 0.0228
## friend_cnt 0.7812 0.2131 0.1958
## avg_friend_age 0.0630 0.0016 0.0438
## avg_friend_male 0.0098 0.0195 -0.0006
## friend_country_cnt 0.5085 0.3286 0.3084
## subscriber_friend_cnt 1.0000 0.1372 0.1763
## songsListened 0.1372 1.0000 0.2331
## lovedTracks 0.1763 0.2331 1.0000
## posts 0.0545 0.0890 0.0582
## playlists 0.0827 0.0740 0.1348
## shouts 0.1378 0.1302 0.1000
## adopter 0.1156 0.1454 0.1650
## tenure 0.0189 0.2416 0.0108
## good_country 0.0087 0.0274 0.0132
## ynsf 0.3342 0.2638 0.2282
## songsListened_1k 0.1372 1.0000 0.2331
## posts playlists shouts adopter tenure
## ID 0.0101 0.0341 0.0237 0.4712 0.0061
## age 0.0051 0.1125 -0.0235 0.0859 0.3003
## male 0.0087 -0.0085 -0.0172 0.0605 0.0938
## friend_cnt 0.0469 0.0471 0.1954 0.0894 -0.0014
## avg_friend_age 0.0062 0.1041 -0.0227 0.0759 0.3174
## avg_friend_male 0.0053 -0.0025 -0.0058 0.0173 0.0858
## friend_country_cnt 0.0852 0.0959 0.2147 0.1433 0.0081
## subscriber_friend_cnt 0.0545 0.0827 0.1378 0.1156 0.0189
## songsListened 0.0890 0.0740 0.1302 0.1454 0.2416
## lovedTracks 0.0582 0.1348 0.1000 0.1650 0.0108
## posts 1.0000 0.0120 0.1224 0.0366 0.0399
## playlists 0.0120 1.0000 0.0158 0.0757 0.0716
## shouts 0.1224 0.0158 1.0000 0.0527 0.0219
## adopter 0.0366 0.0757 0.0527 1.0000 0.0243
## tenure 0.0399 0.0716 0.0219 0.0243 1.0000
## good_country -0.0019 -0.0009 -0.0171 -0.0400 0.1320
## ynsf 0.0634 0.0709 0.0992 0.1918 0.0704
## songsListened_1k 0.0890 0.0740 0.1302 0.1454 0.2416
## good_country ynsf songsListened_1k
## ID -0.0039 0.0841 0.0691
## age 0.0977 0.1054 0.0220
## male 0.0013 0.0064 0.1165
## friend_cnt -0.0318 0.2812 0.2131
## avg_friend_age 0.0944 0.1325 0.0016
## avg_friend_male 0.0216 0.0301 0.0195
## friend_country_cnt -0.0517 0.4527 0.3286
## subscriber_friend_cnt 0.0087 0.3342 0.1372
## songsListened 0.0274 0.2638 1.0000
## lovedTracks 0.0132 0.2282 0.2331
## posts -0.0019 0.0634 0.0890
## playlists -0.0009 0.0709 0.0740
## shouts -0.0171 0.0992 0.1302
## adopter -0.0400 0.1918 0.1454
## tenure 0.1320 0.0704 0.2416
## good_country 1.0000 -0.0100 0.0274
## ynsf -0.0100 1.0000 0.2638
## songsListened_1k 0.0274 0.2638 1.0000
corrplot(res2)
Based on the analysis, we find that the following varaibles are relatively highly correlated: age & avg_friend_age; male & avg_friend_male; friend_cnt & friend_country_cnt; friend_cnt & subscriber_friend_cnt; friend_country_cnt & subscriber_firend_cnt.
In order to build a better regression model, we should not use independent variables which are relatively highly correlated. Let’s see what it shows when putting all the variables into the model.
mod.fit1 <- glm(adopter ~ age + male + friend_cnt + avg_friend_age + avg_friend_male + friend_country_cnt
+ subscriber_friend_cnt + lovedTracks + posts + playlists + songsListened_1k
+ shouts + tenure + good_country, family = binomial(), data = Highnote)
summary(mod.fit1)
##
## Call:
## glm(formula = adopter ~ age + male + friend_cnt + avg_friend_age +
## avg_friend_male + friend_country_cnt + subscriber_friend_cnt +
## lovedTracks + posts + playlists + songsListened_1k + shouts +
## tenure + good_country, family = binomial(), data = Highnote)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.3526 -0.4114 -0.3500 -0.2913 2.7018
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.179e+00 9.571e-02 -43.665 < 2e-16 ***
## age 1.962e-02 3.478e-03 5.641 1.69e-08 ***
## male 4.133e-01 4.169e-02 9.913 < 2e-16 ***
## friend_cnt -4.312e-03 4.920e-04 -8.765 < 2e-16 ***
## avg_friend_age 2.954e-02 4.484e-03 6.588 4.45e-11 ***
## avg_friend_male 1.162e-01 6.346e-02 1.831 0.0671 .
## friend_country_cnt 4.326e-02 3.616e-03 11.962 < 2e-16 ***
## subscriber_friend_cnt 9.132e-02 1.073e-02 8.512 < 2e-16 ***
## lovedTracks 6.950e-04 4.933e-05 14.088 < 2e-16 ***
## posts 8.492e-05 9.580e-05 0.886 0.3754
## playlists 5.920e-02 1.333e-02 4.441 8.97e-06 ***
## songsListened_1k 7.626e-03 5.192e-04 14.687 < 2e-16 ***
## shouts 1.108e-04 8.428e-05 1.314 0.1887
## tenure -4.476e-03 1.022e-03 -4.380 1.19e-05 ***
## good_country -4.152e-01 4.078e-02 -10.181 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24537 on 43826 degrees of freedom
## Residual deviance: 22613 on 43812 degrees of freedom
## AIC: 22643
##
## Number of Fisher Scoring iterations: 5
Multicollinearity can be detected using a statistic called the variance inflation factor (VIF). For any predictor variable, the square root of the VIF indicates the degree to which the confidence interval for that variable’s regression parameter is expanded relative to a model with uncorrelated predictors (hence the name). VIF values are pro- vided by the vif() function in the car package. As a general rule, sqrt(vif) > 2 indicates a multicollinearity problem.
vif(mod.fit1)
## age male friend_cnt
## 2.028083 1.061966 4.295009
## avg_friend_age avg_friend_male friend_country_cnt
## 2.061113 1.042020 2.621221
## subscriber_friend_cnt lovedTracks posts
## 3.007514 1.150339 1.088116
## playlists songsListened_1k shouts
## 1.044297 1.280630 1.337860
## tenure good_country
## 1.213634 1.029508
sqrt(vif(mod.fit1)) > 2
## age male friend_cnt
## FALSE FALSE TRUE
## avg_friend_age avg_friend_male friend_country_cnt
## FALSE FALSE FALSE
## subscriber_friend_cnt lovedTracks posts
## FALSE FALSE FALSE
## playlists songsListened_1k shouts
## FALSE FALSE FALSE
## tenure good_country
## FALSE FALSE
outlierTest(mod.fit1)
## rstudent unadjusted p-value Bonferonni p
## 32663 -5.837848 5.2879e-09 0.00023175
The results indicate that variable friend_cnt has a multicollinearity problem with these predictor variables. We’ll take out this variable to further analyze. Further, based on the mean analysis graph, logical assumption, and mod.fit1, we choose to include variable the following model: age, subscriber_friend_cnt, lovedTracks, playlists, songsListened_1k, good_country
mod.fit2 <- glm(adopter ~ age + subscriber_friend_cnt
+ lovedTracks + playlists + songsListened_1k
+ good_country, family = binomial(), data = Highnote)
summary(mod.fit2)
##
## Call:
## glm(formula = adopter ~ age + subscriber_friend_cnt + lovedTracks +
## playlists + songsListened_1k + good_country, family = binomial(),
## data = Highnote)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.3540 -0.4065 -0.3553 -0.3124 2.6222
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.613e+00 6.537e-02 -55.275 < 2e-16 ***
## age 3.627e-02 2.449e-03 14.815 < 2e-16 ***
## subscriber_friend_cnt 9.476e-02 8.250e-03 11.487 < 2e-16 ***
## lovedTracks 7.808e-04 4.923e-05 15.859 < 2e-16 ***
## playlists 6.589e-02 1.352e-02 4.874 1.09e-06 ***
## songsListened_1k 8.306e-03 4.757e-04 17.460 < 2e-16 ***
## good_country -4.408e-01 4.044e-02 -10.902 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24537 on 43826 degrees of freedom
## Residual deviance: 22880 on 43820 degrees of freedom
## AIC: 22894
##
## Number of Fisher Scoring iterations: 5
vif(mod.fit2)
## age subscriber_friend_cnt lovedTracks
## 1.041901 1.125002 1.121123
## playlists songsListened_1k good_country
## 1.038990 1.086087 1.018351
sqrt(vif(mod.fit2)) > 2
## age subscriber_friend_cnt lovedTracks
## FALSE FALSE FALSE
## playlists songsListened_1k good_country
## FALSE FALSE FALSE
outlierTest(mod.fit2)
## rstudent unadjusted p-value Bonferonni p
## 32663 -7.781185 7.1848e-15 3.1489e-10
## 21293 -6.125326 9.0498e-10 3.9663e-05
## 10623 -4.906967 9.2495e-07 4.0538e-02
Note that the model is no longer suffered from multicollinearity problem, but still, we have some outliers, we will delete these outliers from the data set, and do a regression based on the new data set.
HighnoteNew <- Highnote[c(-32663,-21293,-10623,-37360,-3364,-12898,-27575,-30653,-12277),]
mod.fit3 <- glm(adopter ~ age + subscriber_friend_cnt*age
+ lovedTracks + playlists + songsListened_1k
+ good_country, family = binomial(), data = HighnoteNew)
summary(mod.fit3)
##
## Call:
## glm(formula = adopter ~ age + subscriber_friend_cnt * age + lovedTracks +
## playlists + songsListened_1k + good_country, family = binomial(),
## data = HighnoteNew)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.0100 -0.4036 -0.3467 -0.3041 2.6563
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.813e+00 6.907e-02 -55.207 < 2e-16 ***
## age 4.248e-02 2.551e-03 16.655 < 2e-16 ***
## subscriber_friend_cnt 3.508e-01 2.523e-02 13.904 < 2e-16 ***
## lovedTracks 7.822e-04 4.975e-05 15.723 < 2e-16 ***
## playlists 6.982e-02 1.364e-02 5.118 3.1e-07 ***
## songsListened_1k 7.480e-03 4.804e-04 15.569 < 2e-16 ***
## good_country -4.267e-01 4.060e-02 -10.510 < 2e-16 ***
## age:subscriber_friend_cnt -6.920e-03 7.786e-04 -8.888 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24535 on 43817 degrees of freedom
## Residual deviance: 22580 on 43810 degrees of freedom
## AIC: 22596
##
## Number of Fisher Scoring iterations: 5
The AIC changed from 22894 to 22596, indicating it’s a better model.
The expected variance for data drawn from a binomial distribution is σ2 = nπ(1 − π), where n is the number of observations and π is the probability of belonging to the Y = 1 group. Overdispersion occurs when the observed variance of the response variable is larger than what would be expected from a binomial distribution. Overdispersion can lead to distorted test standard errors and inaccurate tests of significance. We can also test if there is an overdispersion problem with the model using the following code:
deviance(mod.fit3)/df.residual(mod.fit3)
## [1] 0.5153999
With logistic regression, overdispersion is suggested if the ratio of the residual deviance to the residual degrees of freedom is much larger than 1, which is not our case here.
By looking at p-value, all the variables, including the intercept, are significant with p-value less than 0.01. Let’s look at the regression coefficients:
coef(mod.fit3)
## (Intercept) age
## -3.8133120779 0.0424823372
## subscriber_friend_cnt lovedTracks
## 0.3507614447 0.0007821799
## playlists songsListened_1k
## 0.0698206735 0.0074795246
## good_country age:subscriber_friend_cnt
## -0.4266934107 -0.0069202213
In a logistic regression, the response being modeled is the log(odds) that Y = 1. The regression coefficients give the change in log(odds) in the response for a unit change in the predictor variable, holding all other predictor variables constant. Because log(odds) are difficult to interpret, we can exponentiate them to put the results on an odds scale:
exp(coef(mod.fit3))
## (Intercept) age
## 0.02207494 1.04339763
## subscriber_friend_cnt lovedTracks
## 1.42014850 1.00078249
## playlists songsListened_1k
## 1.07231587 1.00750757
## good_country age:subscriber_friend_cnt
## 0.65266362 0.99310367
Now we can see that the odds of a fee-user conversion are increased by a factor of 1.00078249 for a one-unit increase in ‘lovedTracks’, (holding ‘subscriber_friend_cnt’, ‘lovedTracks’, ‘playlists’, ‘songsListened_1k’, ‘good_country’ constant). Conversely, the odds of a fee-user conversion are multiplied by a factor of 0.0007821799 for a one-unit increase in ‘lovedTracks’.
The odds of a fee-user conversion increase with ‘age’, ‘subscriber_friend_cnt’, ‘lovedTracks’, ‘playlists’, ‘songsListened_1k’, and decrease with ‘good_country’, ‘age:subscriber_friend_cnt’.
A negative interaction coefficient in ‘age:subscriber_friend_cnt’ means that the effect of the combined action of two predictors is less then the sum of the individual effects.
Because the predictor variables can’t equal 0, the intercept isn’t meaningful in this case.
From my analysis. The results inform a “free-to-fee” strategy for High Note as follows:
When the company trying to put money in converting fee-users, try to: